In [37]:
import time
import datetime as dt
import pickle
import numpy as np
import random
import scipy as sp
from dict_stops import *
import pandas as pd
import os
import csv
from geopy.distance import vincenty
In [5]:
# Standardizes the boarding and alighting stop values of one row.
def update_vals(row,data = load_metro_dictionary()):
    """Replace the row's stop identifiers with their mapped values.

    Parameters
    ----------
    row : object exposing ``par_subida`` and ``par_bajada`` attributes
        (a DataFrame row when used through ``DataFrame.apply(axis=1)``).
    data : dict
        Lookup table. The default is built once, when this ``def`` is
        executed, by calling ``load_metro_dictionary()`` -- that function
        must therefore already be defined at that point.

    Returns
    -------
    The same ``row`` object, updated in place for every stop found in ``data``.
    """
    for stop_field in ('par_subida', 'par_bajada'):
        current_stop = getattr(row, stop_field)
        if current_stop in data:
            setattr(row, stop_field, data[current_stop])
    return row
In [6]:
# Looks up one coordinate of the stop stored in a given column of the row.
def add_vals(row,latlong,paradero,data = dict_latlong_stops):
    """Return ``data[row[paradero]][latlong]`` or NaN when the stop is unknown.

    Parameters
    ----------
    row : mapping-like row, indexed with ``paradero``.
    latlong : key selecting the wanted value inside the stop entry
        (the callers in this file pass ``'lat'`` or ``'long'``).
    paradero : name of the column that holds the stop identifier.
    data : dict of stop identifier -> per-stop mapping; defaults to
        ``dict_latlong_stops``.

    Returns
    -------
    The stored value, or ``np.nan`` if the stop is not a key of ``data``.
    """
    stop_name = row[paradero]
    if stop_name not in data:
        return np.nan
    return data[stop_name][latlong]
In [7]:
def frame_config(frame):
    """Prepare a raw trip-stage frame for sequence extraction.

    Steps, in order:
      1. parse ``tiempo_subida`` / ``tiempo_bajada`` as datetimes (these two
         assignments modify the frame passed in by the caller);
      2. standardize stop names row by row with ``update_vals``;
      3. add ``weekday`` from the boarding time;
      4. add ``lat_subida``, ``lat_bajada``, ``long_subida``, ``long_bajada``
         through ``add_vals`` (NaN when a stop is unknown);
      5. sort by ``id`` and ``tiempo_subida``;
      6. add ``diferencia_tiempo``: boarding time minus the previous row's
         boarding time, with a zero timedelta for the first row.

    NOTE(review): ``diferencia_tiempo`` is computed with a plain ``shift()``
    over the whole sorted frame, so the first row of every ``id`` holds the
    difference to the last row of the previous ``id`` -- confirm this is
    intended.

    Returns
    -------
    The configured DataFrame (a new object from step 2 onwards).
    """
    frame['tiempo_subida'] = pd.to_datetime(frame.tiempo_subida)
    frame['tiempo_bajada'] = pd.to_datetime(frame.tiempo_bajada)
    frame = frame.apply(update_vals, axis=1)
    frame['weekday'] = frame.tiempo_subida.dt.dayofweek
    # Column creation order is kept: lat_subida, lat_bajada, long_subida, long_bajada.
    for coordinate in ('lat', 'long'):
        for leg in ('subida', 'bajada'):
            frame[coordinate + '_' + leg] = frame.apply(
                add_vals, args=(coordinate, 'par_' + leg), axis=1)
    frame = frame.sort_values(by=['id', 'tiempo_subida'])
    elapsed = frame['tiempo_subida'] - frame['tiempo_subida'].shift()
    # Fill with an explicit Timedelta: an integer fill value on a
    # timedelta64 column is deprecated in pandas.
    frame['diferencia_tiempo'] = elapsed.fillna(pd.Timedelta(0))
    return frame
In [22]:
def hour_to_seconds(an_hour):
    """Return the number of seconds since midnight as an ``int``.

    ``an_hour`` may be any object exposing ``hour``, ``minute`` and
    ``second`` attributes.
    """
    whole_minutes = an_hour.hour * 60 + an_hour.minute
    return int(whole_minutes * 60 + an_hour.second)
In [8]:
# Configure the first extract: parse timestamps, standardize stops, attach
# coordinates and sort by user and boarding time (see frame_config).
frame = frame_config(frame)
In [9]:
# Apply the same configuration to the second extract.
df_id_period = frame_config(df_id_period)
In [11]:
# Keep only the columns that are later handed to get_sequences:
# user id plus boarding/alighting time and coordinates.
dframe = frame[['id','tiempo_subida','lat_subida','long_subida','tiempo_bajada','lat_bajada','long_bajada']]
In [12]:
# Same column selection for the second extract (rebinds df_id_period).
df_id_period = df_id_period[['id','tiempo_subida','lat_subida','long_subida','tiempo_bajada','lat_bajada','long_bajada']]
In [3]:
# Machine-specific locations of the metro dictionary CSV and of the
# sequences folder, selected by operating system.
if os.name == 'nt':
    # Every backslash is escaped explicitly. The resulting strings are the
    # same as before on Python 2, but no longer depend on '\U', '\c', '\D'
    # and '\s' being left alone by the parser ('\U' is a syntax error in
    # Python 3 string literals).
    path_subway_dictionary = 'C:\\Users\\catalina\\Documents\\Datois\\Diccionario-EstacionesMetro.csv'
    path_csv_sequences = 'C:\\Users\\catalina\\Documents\\sequences\\'
else:
    path_subway_dictionary = '/home/cata/Documentos/Datois/Diccionario-EstacionesMetro.csv'
    path_csv_sequences = '/home/cata/Documentos/sequences/'
# Loads the metro stations into a dictionary.
def load_metro_dictionary():
    """Read the ';'-delimited file at ``path_subway_dictionary`` into a dict.

    Each row contributes one entry: the field at index 5 is the key and the
    field at index 7 is the value (presumably raw station identifier ->
    standardized name; confirm against the CSV). Later rows overwrite
    earlier ones that share a key. Rows with fewer than 8 fields -- for
    instance blank lines -- are skipped instead of raising ``IndexError``.

    Returns
    -------
    dict mapping str -> str.

    Raises
    ------
    IOError / OSError if the file cannot be opened.
    """
    with open(path_subway_dictionary,mode='r') as infile:
        reader = csv.reader(infile,delimiter=';')
        return {fields[5]: fields[7] for fields in reader if len(fields) > 7}
In [2]:
# Load the first extract (file name: etapas_2013_abril_...).
# NOTE(review): this absolute path is hardcoded and does not go through the
# os.name switch used for the other paths in this notebook.
frame = pd.read_csv('/home/cata/Documentos/Datois/etapas_2013_abril_allyearsids_10_100000.csv')
In [4]:
# Load the second extract (file name: etapas_2013_septiembre_...).
# NOTE(review): hardcoded absolute path, same caveat as the first extract.
df_id_period = pd.read_csv('/home/cata/Documentos/Datois/etapas_2013_septiembre_allyearsids_10_100000.csv')
# Parse boarding times and order rows by user and boarding time.
# NOTE(review): frame_config performs the same parsing and sorting, so these
# two statements look redundant once frame_config is applied -- confirm.
df_id_period['tiempo_subida'] = pd.to_datetime(df_id_period.tiempo_subida)
df_id_period = df_id_period.sort_values(by=['id', 'tiempo_subida'])
In [13]:
def create_sequence(id_user, mls, nvisitas, sequence):
    """Bundle one user's data into a profile dictionary.

    The arguments are stored as given (no copies) under the keys
    ``'user_id'``, ``'mls'``, ``'nvisitas'`` and ``'sequence'``.
    """
    profile = {}
    profile['user_id'] = id_user
    profile['mls'] = mls
    profile['nvisitas'] = nvisitas
    profile['sequence'] = sequence
    return profile
In [14]:
def buscar_locacion(mls,location):
    """Return the position of the first occurrence of ``location`` in ``mls``.

    Returns ``-1`` when ``location`` is not present.
    """
    try:
        return mls.index(location)
    except ValueError:
        return -1
In [15]:
def get_sequences(ids,lat_subidas,long_subidas,t_subidas,lat_bajadas,long_bajadas,t_bajadas):
    """Build one profile per user from parallel transaction columns.

    The seven arguments are iterated together, one element per transaction.
    Rows are expected to be grouped by user: a new profile starts every time
    the user id differs from the previous valid row.

    A row is ignored when its boarding latitude or boarding time is missing.
    When the alighting latitude or alighting time is missing, only the
    boarding stop is recorded. Missing values are detected through
    self-inequality (``x != x``), which is how NaN / NaT behave.

    Each profile is the dictionary produced by ``create_sequence`` with:
      * ``mls``      -- distinct ``(lat, long)`` stops in order of first visit;
      * ``nvisitas`` -- visit counters parallel to ``mls``;
      * ``sequence`` -- ``(lat, long, seconds_since_midnight)`` triples in
        visiting order. A boarding stop equal to the previously recorded
        stop is not appended again.

    Returns
    -------
    list of profile dicts. The list is empty when no row is usable (the
    previous implementation returned a placeholder profile for user ``-22``
    in that situation).
    """
    profiles = []  # list of profile dictionaries
    started = False  # becomes True once the first usable row is seen
    last_id = None
    last_stop = None
    mls = []
    nvisitas = []
    sequence = []
    transactions = zip(ids,lat_subidas,long_subidas,t_subidas,lat_bajadas,long_bajadas,t_bajadas)
    for id_user, lat_subida, long_subida, t_subida, lat_bajada, long_bajada, t_bajada in transactions:
        # Skip rows without a usable boarding record.
        if lat_subida != lat_subida or t_subida != t_subida:
            continue
        par_subida = (lat_subida, long_subida)
        subida_3 = (lat_subida, long_subida, hour_to_seconds(t_subida))

        if not started or id_user != last_id:
            # Close the previous user's profile, if any, and start a new one
            # seeded with the current boarding stop.
            if started:
                profiles.append(create_sequence(last_id, mls, nvisitas, sequence))
            started = True
            last_id = id_user
            mls = [par_subida]
            sequence = [subida_3]
            last_stop = par_subida
            # Starts at 0: the lookup right below finds the stop and counts
            # this first visit.
            nvisitas = [0]

        index_subida = buscar_locacion(mls, par_subida)
        if index_subida < 0:
            # Boarding stop never visited before: register it.
            mls.append(par_subida)
            nvisitas.append(1)
            sequence.append(subida_3)
        else:
            nvisitas[index_subida] += 1
            # Avoid repeating the stop the user was already at.
            if par_subida != last_stop:
                sequence.append(subida_3)

        # Without an alighting estimate only the boarding stop is kept.
        if lat_bajada != lat_bajada or t_bajada != t_bajada:
            last_stop = par_subida
            continue

        par_bajada = (lat_bajada, long_bajada)
        sequence.append((lat_bajada, long_bajada, hour_to_seconds(t_bajada)))
        last_stop = par_bajada
        index_bajada = buscar_locacion(mls, par_bajada)
        if index_bajada < 0:
            mls.append(par_bajada)
            nvisitas.append(1)
        else:
            nvisitas[index_bajada] += 1

    # Flush the profile of the last user.
    if started:
        profiles.append(create_sequence(last_id, mls, nvisitas, sequence))
    return profiles
In [23]:
# Build the per-user profiles for the first extract.
profiles = get_sequences(dframe['id'],dframe['lat_subida'],dframe['long_subida'],dframe['tiempo_subida'],dframe['lat_bajada'],dframe['long_bajada'],dframe['tiempo_bajada'])
In [24]:
# Build the per-user profiles for the second extract.
profiles_tw2 = get_sequences(df_id_period['id'],df_id_period['lat_subida'],df_id_period['long_subida'],df_id_period['tiempo_subida'],df_id_period['lat_bajada'],df_id_period['long_bajada'],df_id_period['tiempo_bajada'])
In [26]:
def delete(sequence,i,c,sum_lat=0,sum_long=0,sum_temp=0):
    """Cost of deleting element ``i``: how far the sequence centroid moves.

    Parameters
    ----------
    sequence : list of ``(lat, long, time)`` triples.
    i : index of the element to delete.
    c : weight of the temporal component; the spatial component is
        weighted by ``1 - c``.
    sum_lat, sum_long, sum_temp : optional precomputed column sums of
        ``sequence``. They are computed here only when all three are left
        at 0, so a genuine latitude sum of 0 no longer causes the supplied
        sums to be counted twice.

    Returns
    -------
    ``sqrt((1 - c) * (d_lat**2 + d_long**2) + c * d_time**2)`` where each
    ``d`` is the difference between the mean before and after the deletion.

    Raises
    ------
    ZeroDivisionError when ``sequence`` has 0 or 1 elements (no centroid is
    left after the deletion).
    """
    # float() forces true division on Python 2, where the temporal sum is an int.
    n = float(len(sequence))
    if sum_lat == 0 and sum_long == 0 and sum_temp == 0:
        for point in sequence:
            sum_lat += point[0]
            sum_long += point[1]
            sum_temp += point[2]
    removed = sequence[i]
    lat_distance = (sum_lat/n - (sum_lat - removed[0])/(n - 1))**2
    long_distance = (sum_long/n - (sum_long - removed[1])/(n - 1))**2
    temporal_distance = (sum_temp/n - (sum_temp - removed[2])/(n - 1))**2
    spatial_distance = lat_distance + long_distance
    return ((1-c)*spatial_distance + c*temporal_distance)**0.5
In [27]:
def insert(sequence,pi,c,sum_lat=0,sum_long=0,sum_temp=0):
    """Cost of inserting point ``pi``: how far the sequence centroid moves.

    Parameters
    ----------
    sequence : list of ``(lat, long, time)`` triples.
    pi : the ``(lat, long, time)`` triple to insert.
    c : weight of the temporal component; the spatial component is
        weighted by ``1 - c``.
    sum_lat, sum_long, sum_temp : optional precomputed column sums of
        ``sequence``; computed here only when all three are left at 0.

    Returns
    -------
    ``sqrt((1 - c) * (d_lat**2 + d_long**2) + c * d_time**2)`` where each
    ``d`` is the difference between the mean before and after the insertion.

    Raises
    ------
    ZeroDivisionError when ``sequence`` is empty.
    """
    # float() forces true division on Python 2, where the temporal sum is an int.
    n = float(len(sequence))
    if sum_lat == 0 and sum_long == 0 and sum_temp == 0:
        for point in sequence:
            sum_lat += point[0]
            sum_long += point[1]
            sum_temp += point[2]
    # Each component uses its own coordinate of pi (the longitude and time
    # terms previously reused pi[0], the latitude).
    lat_distance = (sum_lat/n - (sum_lat + pi[0])/(n + 1))**2
    long_distance = (sum_long/n - (sum_long + pi[1])/(n + 1))**2
    temporal_distance = (sum_temp/n - (sum_temp + pi[2])/(n + 1))**2
    spatial_distance = lat_distance + long_distance
    return ((1-c)*spatial_distance + c*temporal_distance)**0.5
In [28]:
def replace(sequence,pi,pj,c,sum_lat=0,sum_long=0,sum_temp=0):
    """Cost of replacing point ``pi`` by ``pj``: how far the centroid moves.

    Parameters
    ----------
    sequence : list of ``(lat, long, time)`` triples that contains ``pi``.
    pi : the triple being replaced.
    pj : the triple taking its place.
    c : weight of the temporal component; the spatial component is
        weighted by ``1 - c``.
    sum_lat, sum_long, sum_temp : optional precomputed column sums of
        ``sequence``; computed here only when all three are left at 0.

    Returns
    -------
    ``sqrt((1 - c) * (d_lat**2 + d_long**2) + c * d_time**2)`` where each
    ``d`` is the difference between the mean before and after the swap.

    Raises
    ------
    ZeroDivisionError when ``sequence`` is empty.
    """
    # float() forces true division on Python 2, where the temporal sum is an int.
    n = float(len(sequence))
    if sum_lat == 0 and sum_long == 0 and sum_temp == 0:
        for point in sequence:
            sum_lat += point[0]
            sum_long += point[1]
            sum_temp += point[2]
    sum_lat_plus_pj = sum_lat - pi[0] + pj[0]
    sum_long_plus_pj = sum_long - pi[1] + pj[1]
    sum_temp_plus_pj = sum_temp - pi[2] + pj[2]
    lat_distance = (sum_lat/n - sum_lat_plus_pj/n)**2
    long_distance = (sum_long/n - sum_long_plus_pj/n)**2
    temporal_distance = (sum_temp/n - sum_temp_plus_pj/n)**2
    spatial_distance = lat_distance + long_distance
    return ((1-c)*spatial_distance + c*temporal_distance)**0.5
In [29]:
def cost(a_tuple):
    """Identity placeholder: hand the argument back unchanged."""
    return a_tuple
In [30]:
# Edit-distance style comparison between two sequences of (lat, long, time)
# points. Lower values mean more similar sequences.
def get_similarity(sequence_a,sequence_b,c,sum_lat,sum_long,sum_temp):
    """Return the accumulated edit cost of turning ``sequence_a`` into ``sequence_b``.

    A dynamic-programming table is filled where every cell takes the
    cheapest of a replacement, a deletion or an insertion, priced with
    ``replace``, ``delete`` and ``insert``.

    Parameters
    ----------
    sequence_a, sequence_b : lists of ``(lat, long, time)`` triples.
    c : temporal weight forwarded to the cost functions.
    sum_lat, sum_long, sum_temp : column sums of ``sequence_a``; forwarded
        to the cost functions for the interior cells only (the first row
        and first column call them without the sums).

    Returns
    -------
    The bottom-right cell of the table (a numpy float).

    NOTE(review): the original header comment stated that the result is
    normalized by sequence length; no normalization is performed here.
    """
    rows = len(sequence_a)
    cols = len(sequence_b)
    D = np.zeros((rows + 1, cols + 1))
    # First column: delete every element of sequence_a, one after another.
    for i in range(1, rows + 1):
        D[i, 0] = D[i - 1, 0] + delete(sequence_a, i - 1, c)
    # First row: insert every element of sequence_b, one after another.
    for j in range(1, cols + 1):
        D[0, j] = D[0, j - 1] + insert(sequence_a, sequence_b[j - 1], c)
    for i in range(1, rows + 1):
        point_a = sequence_a[i - 1]
        for j in range(1, cols + 1):
            point_b = sequence_b[j - 1]
            by_replacing = D[i - 1, j - 1] + replace(sequence_a, point_a, point_b, c, sum_lat, sum_long, sum_temp)
            by_deleting = D[i - 1, j] + delete(sequence_a, i - 1, c, sum_lat, sum_long, sum_temp)
            by_inserting = D[i, j - 1] + insert(sequence_a, point_b, c, sum_lat, sum_long, sum_temp)
            D[i, j] = min(by_replacing, by_deleting, by_inserting)
    return D[rows, cols]
# Builds the identification matrix: entry [i, j] is the edit cost between the
# sequence of the i-th profile of the first period and the sequence of the
# j-th profile of the second period.
def get_identification_matrix(profiles_tw1,profiles_tw2,c):
    """Return a square matrix of pairwise sequence edit costs.

    Parameters
    ----------
    profiles_tw1, profiles_tw2 : sized iterables of profile dicts (as built
        by ``get_sequences``); only the ``'sequence'`` entry is read. Both
        are assumed to list the same users in the same order, so that the
        diagonal holds the "same user" comparisons.
    c : temporal weight forwarded to ``delete`` / ``insert`` / ``replace``.

    Returns
    -------
    ``numpy.ndarray`` of shape ``(limit, limit)`` with
    ``limit = min(len(profiles_tw1), len(profiles_tw2))``; only the first
    ``limit`` profiles of each argument are compared.

    Notes
    -----
    The deletion cost depends only on the row and the insertion cost only on
    the column, so both are computed once (per profile / per pair) with the
    precomputed sums instead of once per table cell.
    """
    limit = min((len(profiles_tw1),len(profiles_tw2)))
    identification_matrix = np.zeros((limit,limit))
    for i, profile_i in zip(range(limit), profiles_tw1):
        sequence_a = profile_i['sequence']
        length_sequence_a = len(sequence_a)
        sum_lat = 0
        sum_long = 0
        sum_temp = 0
        for seq in sequence_a:
            sum_lat += seq[0]
            sum_long += seq[1]
            sum_temp += seq[2]
        # Deletion costs are shared by every profile_j compared against profile_i.
        delete_costs = [delete(sequence_a,n,c,sum_lat,sum_long,sum_temp)
                        for n in range(length_sequence_a)]
        D_0 = np.zeros(length_sequence_a+1)
        for n in range(length_sequence_a):
            D_0[n+1] = D_0[n] + delete_costs[n]
        for j, profile_j in zip(range(limit), profiles_tw2):
            sequence_b = profile_j['sequence']
            length_sequence_b = len(sequence_b)
            insert_costs = [insert(sequence_a,point,c,sum_lat,sum_long,sum_temp)
                            for point in sequence_b]
            D = np.zeros((length_sequence_a+1,length_sequence_b+1))
            D[:,0] = D_0
            for s in range(length_sequence_b):
                D[0,s+1] = D[0,s] + insert_costs[s]
            for r in range(1,length_sequence_a+1):
                for t in range(1,length_sequence_b+1):
                    m1 = D[r-1,t-1] + replace(sequence_a,sequence_a[r-1],sequence_b[t-1],c,sum_lat,sum_long,sum_temp)
                    m2 = D[r-1,t] + delete_costs[r-1]
                    m3 = D[r,t-1] + insert_costs[t-1]
                    D[r,t] = min(m1,m2,m3)
            identification_matrix[i,j] = D[length_sequence_a,length_sequence_b]
    return identification_matrix
In [31]:
# Build the identification matrix for the first 20 profiles of each period
# (temporal weight c = 0) and report the elapsed wall-clock seconds.
init_time = time.time()
iden_matrix = get_identification_matrix(profiles[:20],profiles_tw2[:20],0)
# Call form of print: same output on Python 2, and valid on Python 3.
print(time.time()-init_time)
In [32]:
# Scratch arithmetic. Presumably the elapsed seconds printed by the timing
# cell divided by the 20 x 20 = 400 profile pairs, i.e. seconds per
# comparison -- TODO confirm.
4.89/400
Out[32]:
In [33]:
# Share of users whose own entry (the diagonal) is the minimum of its row
# (a) and of its column (b) in iden_matrix, printed as percentages.
a = 0
b = 0
for i in range(len(iden_matrix)):
    if i == np.argmin(iden_matrix[i,:]):
        a += 1
    if i == np.argmin(iden_matrix[:,i]):
        b += 1
# Call form of print: same output on Python 2, and valid on Python 3.
print(str(a*100.0/len(iden_matrix)))
print(str(b*100.0/len(iden_matrix)))
In [34]:
def delete_meters(sequence,i,c,sum_lat=0,sum_long=0,sum_temp=0):
    """Cost of deleting element ``i``, with the spatial shift in meters.

    Same idea as ``delete``, but the spatial component is the ``vincenty``
    distance in meters between the ``(lat, long)`` centroid before and
    after the deletion.

    Parameters
    ----------
    sequence : list of ``(lat, long, time)`` triples.
    i : index of the element to delete.
    c : weight of the temporal component; the spatial component is
        weighted by ``1 - c``.
    sum_lat, sum_long, sum_temp : optional precomputed column sums of
        ``sequence``; computed here only when all three are left at 0.

    Returns
    -------
    ``sqrt((1 - c) * meters**2 + c * d_time**2)``.

    Raises
    ------
    ZeroDivisionError when ``sequence`` has 0 or 1 elements.

    NOTE(review): ``vincenty`` is deprecated in newer geopy releases in
    favour of ``geodesic`` -- confirm against the installed version.
    """
    # float() forces true division on Python 2, where the temporal sum is an int.
    n = float(len(sequence))
    if sum_lat == 0 and sum_long == 0 and sum_temp == 0:
        for point in sequence:
            sum_lat += point[0]
            sum_long += point[1]
            sum_temp += point[2]
    removed = sequence[i]
    original_centroid = (sum_lat/n, sum_long/n)
    modified_centroid = ((sum_lat - removed[0])/(n - 1), (sum_long - removed[1])/(n - 1))
    temporal_distance = (sum_temp/n - (sum_temp - removed[2])/(n - 1))**2
    spatial_distance = vincenty(original_centroid,modified_centroid).meters **2
    return ((1-c)*spatial_distance + c*temporal_distance)**0.5
def insert_meters(sequence,pi,c,sum_lat=0,sum_long=0,sum_temp=0):
    """Cost of inserting point ``pi``, with the spatial shift in meters.

    Same idea as ``insert``, but the spatial component is the ``vincenty``
    distance in meters between the ``(lat, long)`` centroid before and
    after the insertion.

    Parameters
    ----------
    sequence : list of ``(lat, long, time)`` triples.
    pi : the ``(lat, long, time)`` triple to insert.
    c : weight of the temporal component; the spatial component is
        weighted by ``1 - c``.
    sum_lat, sum_long, sum_temp : optional precomputed column sums of
        ``sequence``; computed here only when all three are left at 0.

    Returns
    -------
    ``sqrt((1 - c) * meters**2 + c * d_time**2)``.

    Raises
    ------
    ZeroDivisionError when ``sequence`` is empty.

    NOTE(review): ``vincenty`` is deprecated in newer geopy releases in
    favour of ``geodesic`` -- confirm against the installed version.
    """
    # float() forces true division on Python 2, where the temporal sum is an int.
    n = float(len(sequence))
    if sum_lat == 0 and sum_long == 0 and sum_temp == 0:
        for point in sequence:
            sum_lat += point[0]
            sum_long += point[1]
            sum_temp += point[2]
    original_centroid = (sum_lat/n, sum_long/n)
    # Longitude and time use pi[1] and pi[2] (they previously reused pi[0],
    # the latitude, which moved the centroid to a wrong position).
    modified_centroid = ((sum_lat + pi[0])/(n + 1), (sum_long + pi[1])/(n + 1))
    temporal_distance = (sum_temp/n - (sum_temp + pi[2])/(n + 1))**2
    spatial_distance = vincenty(original_centroid,modified_centroid).meters **2
    return ((1-c)*spatial_distance + c*temporal_distance)**0.5
def replace_meters(sequence,pi,pj,c,sum_lat=0,sum_long=0,sum_temp=0):
    """Cost of replacing ``pi`` by ``pj``, with the spatial shift in meters.

    Same idea as ``replace``, but the spatial component is the ``vincenty``
    distance in meters between the ``(lat, long)`` centroid before and
    after the swap.

    Parameters
    ----------
    sequence : list of ``(lat, long, time)`` triples that contains ``pi``.
    pi : the triple being replaced.
    pj : the triple taking its place.
    c : weight of the temporal component; the spatial component is
        weighted by ``1 - c``.
    sum_lat, sum_long, sum_temp : optional precomputed column sums of
        ``sequence``; computed here only when all three are left at 0.

    Returns
    -------
    ``sqrt((1 - c) * meters**2 + c * d_time**2)``.

    Raises
    ------
    ZeroDivisionError when ``sequence`` is empty.

    NOTE(review): ``vincenty`` is deprecated in newer geopy releases in
    favour of ``geodesic`` -- confirm against the installed version.
    """
    # float() forces true division on Python 2, where the temporal sum is an int.
    n = float(len(sequence))
    if sum_lat == 0 and sum_long == 0 and sum_temp == 0:
        for point in sequence:
            sum_lat += point[0]
            sum_long += point[1]
            sum_temp += point[2]
    sum_lat_plus_pj = sum_lat - pi[0] + pj[0]
    sum_long_plus_pj = sum_long - pi[1] + pj[1]
    sum_temp_plus_pj = sum_temp - pi[2] + pj[2]
    original_centroid = (sum_lat/n, sum_long/n)
    modified_centroid = (sum_lat_plus_pj/n, sum_long_plus_pj/n)
    temporal_distance = (sum_temp/n - sum_temp_plus_pj/n)**2
    spatial_distance = vincenty(original_centroid,modified_centroid).meters **2
    return ((1-c)*spatial_distance + c*temporal_distance)**0.5
In [50]:
# Builds the identification matrix using the meter-based cost functions:
# entry [i, j] is the edit cost between the sequence of the i-th profile of
# the first period and the sequence of the j-th profile of the second period.
def get_identification_matrix_meters(profiles_tw1,profiles_tw2,c):
    """Return a square matrix of pairwise sequence edit costs (meter variant).

    Parameters
    ----------
    profiles_tw1, profiles_tw2 : sized iterables of profile dicts (as built
        by ``get_sequences``); only the ``'sequence'`` entry is read. Both
        are assumed to list the same users in the same order, so that the
        diagonal holds the "same user" comparisons.
    c : temporal weight forwarded to ``delete_meters`` / ``insert_meters`` /
        ``replace_meters``.

    Returns
    -------
    ``numpy.ndarray`` of shape ``(limit, limit)`` with
    ``limit = min(len(profiles_tw1), len(profiles_tw2))``; only the first
    ``limit`` profiles of each argument are compared.

    Notes
    -----
    The deletion cost depends only on the row and the insertion cost only on
    the column, so both are computed once (per profile / per pair) with the
    precomputed sums instead of once per table cell. Each of those costs
    involves a geodesic distance computation.
    """
    limit = min((len(profiles_tw1),len(profiles_tw2)))
    identification_matrix = np.zeros((limit,limit))
    for i, profile_i in zip(range(limit), profiles_tw1):
        sequence_a = profile_i['sequence']
        length_sequence_a = len(sequence_a)
        sum_lat = 0
        sum_long = 0
        sum_temp = 0
        for seq in sequence_a:
            sum_lat += seq[0]
            sum_long += seq[1]
            sum_temp += seq[2]
        # Deletion costs are shared by every profile_j compared against profile_i.
        delete_costs = [delete_meters(sequence_a,n,c,sum_lat,sum_long,sum_temp)
                        for n in range(length_sequence_a)]
        D_0 = np.zeros(length_sequence_a+1)
        for n in range(length_sequence_a):
            D_0[n+1] = D_0[n] + delete_costs[n]
        for j, profile_j in zip(range(limit), profiles_tw2):
            sequence_b = profile_j['sequence']
            length_sequence_b = len(sequence_b)
            insert_costs = [insert_meters(sequence_a,point,c,sum_lat,sum_long,sum_temp)
                            for point in sequence_b]
            D = np.zeros((length_sequence_a+1,length_sequence_b+1))
            D[:,0] = D_0
            for s in range(length_sequence_b):
                D[0,s+1] = D[0,s] + insert_costs[s]
            for r in range(1,length_sequence_a+1):
                for t in range(1,length_sequence_b+1):
                    m1 = D[r-1,t-1] + replace_meters(sequence_a,sequence_a[r-1],sequence_b[t-1],c,sum_lat,sum_long,sum_temp)
                    m2 = D[r-1,t] + delete_costs[r-1]
                    m3 = D[r,t-1] + insert_costs[t-1]
                    D[r,t] = min(m1,m2,m3)
            identification_matrix[i,j] = D[length_sequence_a,length_sequence_b]
    return identification_matrix
In [51]:
# Build the meter-based identification matrix for the first 100 profiles of
# each period (temporal weight c = 0) and report the elapsed seconds.
init_time = time.time()
iden_matrix_meters = get_identification_matrix_meters(profiles[:100],profiles_tw2[:100],0)
# Call form of print: same output on Python 2, and valid on Python 3.
print(time.time()-init_time)
In [53]:
# Share of users whose own entry (the diagonal) is the minimum of its row
# (a) and of its column (b) in iden_matrix_meters, printed as percentages.
# The loop bound and the denominators use iden_matrix_meters itself; they
# previously used len(iden_matrix), a different (smaller) matrix, so only
# part of the diagonal was checked and the percentages were mis-scaled.
a = 0
b = 0
n_users = len(iden_matrix_meters)
for i in range(n_users):
    if i == np.argmin(iden_matrix_meters[i,:]):
        a += 1
    if i == np.argmin(iden_matrix_meters[:,i]):
        b += 1
# Call form of print: same output on Python 2, and valid on Python 3.
print(str(a*100.0/n_users))
print(str(b*100.0/n_users))
In [ ]: